from bs4 import BeautifulSoup

import requests
import pandas as pd
import re
import time
# Scraper: collects listing details from 58.com (58同城) search-result pages
# and writes them to a CSV file.

# Result-page URL with a page-number placeholder. The original URL had no
# placeholder, so every iteration fetched the same page.
# NOTE(review): the "pn{}" path segment follows the pagination form used by
# the site's other listing URLs — confirm it is valid for this category.
URL_TEMPLATE = ('https://sh.58.com/xinzhuanggongyequ/chuzu/pn{}/'
                '?PGTID=0d3090a7-0182-3164-077c-ad38cf1a8fa9&ClickID=2')
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'}
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Output column order; also guarantees a header row when nothing was scraped.
COLUMNS = ['title', 'type', 'area', 'toward', 'floor',
           'total_price', 'unit_price', 'community', 'region', 'road']

# Patterns are compiled once instead of on every listing.
TITLE_RE = re.compile(r'tongji_label="listclick">(.+?)</a>')
SPAN_RE = re.compile(r'<span>(.+?)</span>')
TOTAL_PRICE_RE = re.compile(r'class="sum"><b>(.+?)</b>')
UNIT_PRICE_RE = re.compile(r'class="unit">(.+?)元/㎡')
LINK_TEXT_RE = re.compile(r'>(.+?)</a>')


def _item(seq, idx):
    """Return ``seq[idx]``, or ``None`` when ``idx`` is past the end."""
    return seq[idx] if idx < len(seq) else None


def _markup(tag):
    """Return the HTML text of ``tag``, or an empty string for ``None``."""
    return '' if tag is None else str(tag)


def _first(pattern, markup):
    """Return the first captured group of ``pattern`` in ``markup``, or ``None``."""
    found = pattern.search(markup)
    return found.group(1) if found else None


# Rows are gathered in a plain list and turned into a DataFrame once;
# DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
rows = []
for page in range(1, 31):
    time.sleep(1)  # pause between requests to avoid hammering the site
    r = requests.get(URL_TEMPLATE.format(page), headers=HEADERS,
                     timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, 'lxml')
    li = soup.find_all(class_=["list-info"])
    pr = soup.find_all(class_=["price"])
    base = soup.find_all(class_=["baseinfo"])

    for i, info in enumerate(li):
        info_html = str(info)
        price_html = _markup(_item(pr, i))
        # Presumably every listing contributes two "baseinfo" elements and the
        # second one holds the location links — confirm against the page markup.
        base_html = _markup(_item(base, 2 * i + 1))

        spans = SPAN_RE.findall(info_html)
        links = LINK_TEXT_RE.findall(base_html)

        # Each field is a single value (or None when absent) so that all rows
        # have the same shape regardless of how many matches the page yields.
        rows.append({
            'title': _first(TITLE_RE, info_html),
            'type': _item(spans, 0),
            'area': _item(spans, 1),
            'toward': _item(spans, 2),
            'floor': _item(spans, 3),
            'total_price': _first(TOTAL_PRICE_RE, price_html),
            'unit_price': _first(UNIT_PRICE_RE, price_html),
            'community': _item(links, 0),
            'region': _item(links, 1),
            'road': _item(links, 2),
        })

data = pd.DataFrame(rows, columns=COLUMNS)

print(len(data))
data.to_csv('闵行.csv', sep=',', index=True)